!conda install -c conda-forge mlxtend --yes
import math
import pandas as pd
import numpy as np
import pandas_profiling  # distributed as ydata-profiling in newer releases
import matplotlib.pyplot as plt
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from feature_engine.categorical_encoders import OneHotCategoricalEncoder  # feature_engine < 1.0; newer releases use feature_engine.encoding.OneHotEncoder
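# Load the Allstate Claims Severity data and take a first look at its shape, types, and head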
df = pd.read_csv('allstate.csv')
df.shape
df.dtypes
df.head()
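# Profile and model on the first 5,000 rows to keep run times manageable; 'loss' is the target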
pandas_profiling.ProfileReport(df[0:5000].copy())
data = df[0:5000].copy()
y = data['loss']
X = data.drop(columns=['loss', 'id'])
X = X.fillna('MISSING')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=251)
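# Split the columns by dtype so numeric and categorical features can be preprocessed separately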
catColumns = X_train.select_dtypes(include='object').columns.tolist()
numColumns = X_train.select_dtypes(include='number').columns.tolist()
X_train_num = X_train[numColumns]
X_test_num = X_test[numColumns]
X_train_cat = X_train[catColumns]
X_test_cat = X_test[catColumns]
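# Standardize numeric features and one-hot encode categoricals; fit on the training set only to avoid leakage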
scaler = StandardScaler()
ohe = OneHotCategoricalEncoder(variables=catColumns)
X_train_num = pd.DataFrame(scaler.fit_transform(X_train_num), columns=numColumns)
X_train_num = X_train_num.reset_index(drop=True)
X_test_num = pd.DataFrame(scaler.transform(X_test_num), columns=numColumns)
X_test_num = X_test_num.reset_index(drop=True)
X_train_cat = ohe.fit_transform(X_train_cat)
X_train_cat = X_train_cat.reset_index(drop=True)
X_test_cat = ohe.transform(X_test_cat)
X_test_cat = X_test_cat.reset_index(drop=True)
X_train = pd.concat([X_train_num, X_train_cat], axis=1)
X_test = pd.concat([X_test_num, X_test_cat], axis=1)
def evaluate(truth, pred):
    """Return the MAE, MSE, and RMSE for a set of predictions."""
    mae = mean_absolute_error(truth, pred)
    mse = mean_squared_error(truth, pred)
    rmse = math.sqrt(mse)
    return (mae, mse, rmse)
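# Accumulate each technique's scores here for the final comparison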
resultsDF = pd.DataFrame(columns=['Technique', 'Method', 'MAE', 'RMSE', 'Features'])
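# Baseline: a random forest trained on all features, with no selection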
rf = RandomForestRegressor(random_state=1337)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Baseline'] = ['None', 'Baseline', mae, rmse, X_train.shape[1]]
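# Filter method: drop one feature from every pair with absolute correlation above 0.95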
corr_matrix = X_train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  # np.bool was removed in NumPy 1.24
features_to_remove = [column for column in upper.columns if any(upper[column] > 0.95)]
print(features_to_remove)  # cont9's strongest correlation falls between 0.90 and 0.95, so it survives this cutoff
X_train_new = X_train.drop(columns=features_to_remove, axis=1)
X_test_new = X_test.drop(columns=features_to_remove, axis=1)
# Predict & Score
rf = RandomForestRegressor(random_state=202)
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Filter:Correlated'] = ['Filter', 'Correlated', mae, rmse, X_train_new.shape[1]]
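# Filter method: keep the top 10% of features ranked by the univariate F-test (f_regression)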
selector = SelectPercentile(f_regression, percentile=10)
selector.fit(X_train, y_train)
X_train_new = selector.transform(X_train)
X_test_new = selector.transform(X_test)
cols = X_train.columns[selector.get_support()]
pvals = selector.pvalues_[selector.get_support()]
scores = -np.log10(pvals)
scores /= scores.max()
featuresFRegDF = pd.DataFrame({'Column': cols, 'P-Value': pvals, 'Score': scores})
featuresFRegDF = featuresFRegDF.sort_values('Score', ascending=False)  # assign the sorted frame, or the ranks below are wrong
featuresFRegDF = featuresFRegDF.reset_index(drop=True)
featuresFRegDF['Rank'] = featuresFRegDF.index + 1
featuresFRegDF.head()
# Predict & Score
rf = RandomForestRegressor(random_state=999)
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Filter:F-Score'] = ['Filter', 'F-Score', mae, rmse, X_train_new.shape[1]]
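# Filter method: keep the top 10% of features ranked by mutual information with the target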
selector = SelectPercentile(mutual_info_regression, percentile=10)
selector.fit(X_train, y_train)
X_train_new = selector.transform(X_train)
X_test_new = selector.transform(X_test)
cols = X_train.columns[selector.get_support()]
scores = selector.scores_[selector.get_support()]
scores /= scores.max()
featuresMutDF = pd.DataFrame({'Column': cols, 'Score': scores})
featuresMutDF = featuresMutDF.sort_values('Score', ascending=False)  # assign the sorted frame, or the ranks below are wrong
featuresMutDF = featuresMutDF.reset_index(drop=True)
featuresMutDF['Rank'] = featuresMutDF.index + 1
featuresMutDF.head()
# Predict & Score
rf = RandomForestRegressor(random_state=999)  # seeded like the other runs for reproducibility
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Filter:MutualInformation'] = ['Filter', 'Mutual Information', mae, rmse, X_train_new.shape[1]]
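# Merge the F-test and mutual-information rankings for a side-by-side comparison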
combineRegDF = featuresFRegDF[['Column', 'Score', 'Rank']]
combineRegDF = combineRegDF.rename(columns={'Score': 'FReg:Score', 'Rank':'FReg:Rank'})
combineMutDF = featuresMutDF[['Column', 'Score', 'Rank']]
combineMutDF = combineMutDF.rename(columns={'Score': 'Mut:Score', 'Rank':'Mut:Rank'})
featuresDF = pd.merge(left=combineRegDF, right=combineMutDF, how='outer', on='Column')
featuresDF
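# Filter method: VarianceThreshold with its default threshold of 0 drops only constant features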
selector = VarianceThreshold()
selector.fit(X_train)
selector.get_support()
X_train_new = selector.transform(X_train)
X_test_new = selector.transform(X_test)
# Predict & Score
rf = RandomForestRegressor(random_state=1337)
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Filter:VarianceThreshold'] = ['Filter', 'VarianceThreshold', mae, rmse, X_train_new.shape[1]]
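# Embedded method: Lasso's L1 penalty shrinks the coefficients of uninformative features to exactly zero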
lassoReg = Lasso(fit_intercept=False)
lassoReg.fit(X_train, y_train)
coefficients = lassoReg.coef_
support = coefficients != 0  # non-zero coefficients mark retained features; negative coefficients count too
cols = X_train.columns[support]
X_train_new = X_train[cols]
X_test_new = X_test[cols]
# Predict & Score
rf = RandomForestRegressor(random_state=999)
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Embedded:Lasso'] = ['Embedded', 'Lasso', mae, rmse, X_train_new.shape[1]]
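# Embedded method: SelectFromModel keeps features whose random forest importance exceeds the mean importance (the default threshold)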
estimator = RandomForestRegressor(random_state=1337)
selector = SelectFromModel(estimator)
selector.fit(X_train, y_train)
cols = X_train.columns[(selector.get_support())]
X_train_new = X_train[cols]
X_test_new = X_test[cols]
# Predict & Score
rf = RandomForestRegressor(random_state=999)
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Embedded:RandomForest'] = ['Embedded', 'RandomForest', mae, rmse, X_train_new.shape[1]]
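# Inspect and plot the importances the fitted forest assigned to the selected features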
features = {}
for feature, importance in zip(X_train_new.columns, rf.feature_importances_):
    features[feature] = importance
importanceDF = pd.DataFrame.from_dict(features, orient='index', columns=['Importance'])
importanceDF.sort_values(by='Importance', ascending=False).head(10)
importanceDF.sort_values(by='Importance', ascending=False)[0:25].plot(kind='bar', rot=90, figsize=(10, 8))
plt.title('Top 25 Important Features')
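# Wrapper method: recursive feature elimination (RFE) with a linear SVR, removing 10 features per iteration until 80 remain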
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=80, step=10, verbose=1)
selector = selector.fit(X_train, y_train)
X_train_new = selector.transform(X_train)
X_test_new = selector.transform(X_test)
# Predict & Score
rf = RandomForestRegressor(random_state=999)
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Wrapper:RFE'] = ['Wrapper', 'RFE', mae, rmse, X_train_new.shape[1]]
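# Wrapper method: sequential forward selection (SFS) greedily adds the feature that most improves the score until 80 are chosen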
estimator = LinearRegression()
sfs = SFS(estimator,
          k_features=80,
          forward=True,
          floating=False,
          verbose=2,
          scoring='neg_mean_squared_error',
          n_jobs=-1,
          cv=0)
sfsModel = sfs.fit(X_train, y_train)
plot_sfs(sfsModel.get_metric_dict(), figsize=(10,8))
plt.title('Sequential Forward Selection')
plt.grid()
plt.show()
features = list(sfsModel.subsets_[80]['feature_names'])
X_train_new = X_train[features]
X_test_new = X_test[features]
# Predict & Score
rf = RandomForestRegressor(random_state=999)
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Wrapper:SFS'] = ['Wrapper', 'SFS', mae, rmse, X_train_new.shape[1]]
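# Wrapper method: sequential forward floating selection (SFFS) also revisits and can drop previously added features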
estimator = LinearRegression()
sffs = SFS(estimator,
           k_features=80,
           forward=True,
           floating=True,
           verbose=2,
           scoring='neg_mean_squared_error',
           n_jobs=-1,
           cv=0)
sffsModel = sffs.fit(X_train, y_train)
features = list(sffsModel.subsets_[80]['feature_names'])
X_train_new = X_train[features]
X_test_new = X_test[features]
# Predict & Score
rf = RandomForestRegressor(random_state=999)
rf.fit(X_train_new, y_train)
pred = rf.predict(X_test_new)
mae, mse, rmse = evaluate(y_test, pred)
print('MAE : {:.2f}'.format(mae))
print('RMSE : {:.2f}'.format(rmse))
resultsDF.loc['Wrapper:SFFS'] = ['Wrapper', 'SFFS', mae, rmse, X_train_new.shape[1]]
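# Rank all techniques by MAE, best first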
resultsDF.sort_values('MAE', ascending=True)